/**
 * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package edu.uci.ics.crawler4j.fetcher;

import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.GZIPInputStream;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpEntity;
import org.apache.http.HttpException;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.NameValuePair;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.entity.HttpEntityWrapper;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.CoreProtocolPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParamBean;
import org.apache.http.protocol.HttpContext;
import org.apache.log4j.Logger;

import edu.uci.ics.crawler4j.crawler.AuthInfo;
import edu.uci.ics.crawler4j.crawler.AuthInfo.AuthenticationType;
import edu.uci.ics.crawler4j.crawler.Configurable;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;

/**
 * Fetches pages over HTTP (and optionally HTTPS) using a shared, pooled
 * {@link DefaultHttpClient}.
 * <p>
 * A single instance is meant to be shared by several crawler threads: the
 * politeness delay between two consecutive requests is enforced globally
 * through {@link #mutex} and {@link #lastFetchTime}. Redirects are not followed
 * automatically; instead the redirect target is reported through
 * {@link PageFetchResult#setMovedToUrl(String)}.
 *
 * @author Yasser Ganjisaffar <lastname at gmail dot com>
 */
public class PageFetcher extends Configurable
{

    protected static final Logger logger = Logger.getLogger(PageFetcher.class);

    /** Connection pool shared by all requests issued through {@link #httpClient}. */
    protected PoolingClientConnectionManager connectionManager;

    protected DefaultHttpClient httpClient;

    /** Guards {@link #lastFetchTime} and serializes the politeness wait. */
    protected final Object mutex = new Object();

    /** Wall-clock time (ms) at which the most recent request was allowed to start. */
    protected long lastFetchTime = 0;

    protected IdleConnectionMonitorThread connectionMonitorThread = null;

    /**
     * Builds the HTTP client from the given configuration: protocol parameters,
     * timeouts, scheme registry, connection pool limits, optional proxy, gzip
     * response decoding and optional authentication. Also starts the idle
     * connection monitor thread.
     *
     * @param config crawl configuration to read all HTTP settings from
     */
    public PageFetcher(CrawlConfig config)
    {
        super(config);

        HttpParams params = new BasicHttpParams();
        HttpProtocolParamBean paramsBean = new HttpProtocolParamBean(params);
        paramsBean.setVersion(HttpVersion.HTTP_1_1);
        paramsBean.setContentCharset("UTF-8");
        paramsBean.setUseExpectContinue(false);

        params.setParameter(CoreProtocolPNames.USER_AGENT, config.getUserAgentString());
        params.setIntParameter(CoreConnectionPNames.SO_TIMEOUT, config.getSocketTimeout());
        params.setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, config.getConnectionTimeout());

        // Redirects are handled by the caller via PageFetchResult.getMovedToUrl().
        params.setBooleanParameter("http.protocol.handle-redirects", false);

        SchemeRegistry schemeRegistry = new SchemeRegistry();
        schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));

        if (config.isIncludeHttpsPages()) {
            schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));
        }

        connectionManager = new PoolingClientConnectionManager(schemeRegistry);
        connectionManager.setMaxTotal(config.getMaxTotalConnections());
        connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());
        httpClient = new DefaultHttpClient(connectionManager, params);

        if (config.getProxyHost() != null) {

            if (config.getProxyUsername() != null) {
                httpClient.getCredentialsProvider().setCredentials(
                        new AuthScope(config.getProxyHost(), config.getProxyPort()),
                        new UsernamePasswordCredentials(config.getProxyUsername(), config.getProxyPassword()));
            }

            HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
            httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
        }

        // Transparently unwrap gzip-encoded response bodies.
        httpClient.addResponseInterceptor(new HttpResponseInterceptor()
        {

            @Override
            public void process(final HttpResponse response, final HttpContext context) throws HttpException,
                    IOException
            {
                HttpEntity entity = response.getEntity();
                if (entity == null) {
                    // Responses without a body (e.g. 204, 304) carry no entity.
                    return;
                }
                Header contentEncoding = entity.getContentEncoding();
                if (contentEncoding != null) {
                    HeaderElement[] codecs = contentEncoding.getElements();
                    for (HeaderElement codec : codecs) {
                        if (codec.getName().equalsIgnoreCase("gzip")) {
                            response.setEntity(new GzipDecompressingEntity(entity));
                            return;
                        }
                    }
                }
            }

        });

        if (config.getAuthInfo() != null && !config.getAuthInfo().isEmpty()) {
            doAuthentication(config.getAuthInfo());
        }

        if (connectionMonitorThread == null) {
            connectionMonitorThread = new IdleConnectionMonitorThread(connectionManager);
        }
        connectionMonitorThread.start();

    }

    /**
     * Performs the login described by each entry: basic authentication entries
     * register credentials, every other type is treated as a form login.
     *
     * @param authInfos authentication descriptors to process, in order
     */
    private void doAuthentication(List<AuthInfo> authInfos)
    {
        for (AuthInfo authInfo : authInfos) {
            if (authInfo.getAuthenticationType().equals(AuthenticationType.BASIC_AUTHENTICATION)) {
                doBasicLogin(authInfo);
            } else {
                doFormLogin(authInfo);
            }
        }
    }

    /**
     * Registers username/password credentials for the host and port named by
     * {@code authInfo} with the client's credentials provider.
     *
     * @param authInfo target host and credentials
     */
    private void doBasicLogin(AuthInfo authInfo)
    {
        HttpHost targetHost = new HttpHost(authInfo.getHost(), authInfo.getPort(), authInfo.getScheme());
        httpClient.getCredentialsProvider().setCredentials(
                new AuthScope(targetHost.getHostName(), targetHost.getPort()),
                new UsernamePasswordCredentials(authInfo.getUsername(), authInfo.getPassword()));
    }

    /**
     * Posts the credentials as {@code username}/{@code password} form fields to
     * the login target of {@code authInfo}. Failures are logged, never thrown.
     * The response body is discarded so that the pooled connection is released.
     *
     * @param authInfo login endpoint and credentials
     */
    private void doFormLogin(AuthInfo authInfo)
    {
        String fullUri = authInfo.getScheme() + "://" + authInfo.getHost() + ":" + authInfo.getPort()
                + authInfo.getLoginTarget();
        HttpPost httpPost = new HttpPost(fullUri);
        List<NameValuePair> formParams = new ArrayList<NameValuePair>();
        formParams.add(new BasicNameValuePair("username", authInfo.getUsername()));
        formParams.add(new BasicNameValuePair("password", authInfo.getPassword()));

        try {
            httpPost.setEntity(new UrlEncodedFormEntity(formParams, "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            // Without an encoded form there is nothing meaningful to post.
            logger.error("Encoding not supported", e);
            return;
        }

        try {
            HttpResponse response = httpClient.execute(httpPost);
            // Closing the content stream hands the connection back to the pool;
            // leaving it open would leak one pooled connection per login.
            HttpEntity responseEntity = response.getEntity();
            if (responseEntity != null && responseEntity.isStreaming()) {
                InputStream content = responseEntity.getContent();
                if (content != null) {
                    content.close();
                }
            }
            logger.debug("Logged in with user " + authInfo.getUsername() + " to " + authInfo.getHost());
        } catch (ClientProtocolException e) {
            logger.error("Client protocol not supported", e);
            httpPost.abort();
        } catch (IOException e) {
            logger.error("Error making request", e);
            httpPost.abort();
        }
    }

    /**
     * Issues a GET request for the given URL and returns the response status,
     * headers and (unread) entity.
     * <p>
     * The call blocks as long as needed to respect the configured politeness
     * delay. For 301/302 responses the canonicalized {@code Location} target is
     * stored in the result. If the advertised content length exceeds the
     * configured maximum download size the request is aborted and the status is
     * {@link CustomFetchStatus#PageTooBig}. Transport failures yield
     * {@link CustomFetchStatus#FatalTransportError}; any other failure yields
     * {@link CustomFetchStatus#UnknownError}. This method does not throw.
     *
     * @param webUrl the URL to fetch
     * @return the fetch result; never {@code null}
     */
    public PageFetchResult fetchHeader(WebURL webUrl)
    {
        PageFetchResult fetchResult = new PageFetchResult();
        String toFetchURL = webUrl.getURL();
        HttpGet get = null;
        try {
            get = new HttpGet(toFetchURL);
            waitForPoliteness(toFetchURL);
            get.addHeader("Accept-Encoding", "gzip");
            HttpResponse response = httpClient.execute(get);
            fetchResult.setEntity(response.getEntity());
            fetchResult.setHeaders(collectHeaders(response));

            int statusCode = response.getStatusLine().getStatusCode();
            if (statusCode != HttpStatus.SC_OK) {
                if (statusCode != HttpStatus.SC_NOT_FOUND) {
                    if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY) {
                        Header header = response.getFirstHeader("Location");
                        if (header != null) {
                            String movedToUrl = header.getValue();
                            movedToUrl = URLCanonicalizer.getCanonicalURL(movedToUrl, toFetchURL);
                            fetchResult.setMovedToUrl(movedToUrl);
                        }
                        fetchResult.setStatusCode(statusCode);
                        return fetchResult;
                    }
                    logger.info("Failed: " + response.getStatusLine().toString() + ", while fetching " + toFetchURL);
                }
                fetchResult.setStatusCode(statusCode);
                return fetchResult;
            }

            fetchResult.setFetchedUrl(toFetchURL);
            String uri = get.getURI().toString();
            if (!uri.equals(toFetchURL)) {
                if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
                    fetchResult.setFetchedUrl(uri);
                }
            }

            if (fetchResult.getEntity() != null) {
                long size = resolveContentLength(response, fetchResult.getEntity());
                if (size > config.getMaxDownloadSize()) {
                    fetchResult.setStatusCode(CustomFetchStatus.PageTooBig);
                    get.abort();
                    return fetchResult;
                }

                fetchResult.setStatusCode(HttpStatus.SC_OK);
                return fetchResult;

            } else {
                get.abort();
            }
        } catch (IOException e) {
            logger.error("Fatal transport error: " + e.getMessage() + " while fetching " + toFetchURL
                    + " (link found in doc #" + webUrl.getParentDocid() + ")");
            fetchResult.setStatusCode(CustomFetchStatus.FatalTransportError);
            return fetchResult;
        } catch (InterruptedException e) {
            // Preserve the interrupt so the owning thread can still observe it.
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting to fetch " + toFetchURL);
        } catch (IllegalStateException ignored) {
            // ignoring exceptions that occur because of not registering https
            // and other schemes
            logger.debug("Skipping " + toFetchURL + ": " + ignored.getMessage());
        } catch (Exception e) {
            if (e.getMessage() == null) {
                logger.error("Error while fetching " + webUrl.getURL());
            } else {
                logger.error(e.getMessage() + " while fetching " + webUrl.getURL());
            }
        } finally {
            try {
                // No entity means nobody downstream will release the connection.
                if (fetchResult.getEntity() == null && get != null) {
                    get.abort();
                }
            } catch (Exception e) {
                logger.error("Exception in finally block while aborting the current page fetch.", e);
            }
        }
        fetchResult.setStatusCode(CustomFetchStatus.UnknownError);
        return fetchResult;
    }

    /**
     * Blocks until at least the configured politeness delay has elapsed since
     * the previous request, then records the current time as the new last
     * fetch time. Waiting happens while holding {@link #mutex}, so concurrent
     * callers are released one at a time.
     *
     * @param toFetchURL URL about to be fetched; used for logging only
     * @throws InterruptedException if the thread is interrupted while waiting
     */
    private void waitForPoliteness(String toFetchURL) throws InterruptedException
    {
        synchronized (mutex) {
            long now = System.currentTimeMillis();
            if (now - lastFetchTime < config.getPolitenessDelay()) {
                long waitingTime = config.getPolitenessDelay() - (now - lastFetchTime);
                logger.info("Waiting " + waitingTime + " ms for politeness reasons to fetch " + toFetchURL);
                Thread.sleep(waitingTime);
            }
            lastFetchTime = System.currentTimeMillis();
        }
    }

    /**
     * Copies all response headers into a name-to-value map. When a header name
     * occurs more than once, the last value wins.
     *
     * @param response the response to read headers from
     * @return a new mutable map of header names to values
     */
    private static Map<String, String> collectHeaders(HttpResponse response)
    {
        Map<String, String> headers = new HashMap<String, String>();
        for (Header header : response.getAllHeaders()) {
            headers.put(header.getName(), header.getValue());
        }
        return headers;
    }

    /**
     * Determines the advertised body size. The entity's own length is used when
     * known; otherwise (for example for gzip-wrapped entities, which report -1)
     * the {@code Content-Length} header is consulted.
     *
     * @param response the response carrying the headers
     * @param entity   the response entity, not {@code null}
     * @return the size in bytes, or -1 if it is unknown or unparsable
     */
    private static long resolveContentLength(HttpResponse response, HttpEntity entity)
    {
        long size = entity.getContentLength();
        if (size != -1) {
            return size;
        }
        // Header name lookup is case-insensitive, so one lookup covers all spellings.
        Header length = response.getLastHeader("Content-Length");
        if (length == null || length.getValue() == null) {
            return -1;
        }
        try {
            // Parsed as long: bodies larger than Integer.MAX_VALUE must still be
            // recognized as too big instead of failing the whole fetch.
            return Long.parseLong(length.getValue().trim());
        } catch (NumberFormatException e) {
            logger.debug("Ignoring malformed Content-Length header: " + length.getValue());
            return -1;
        }
    }

    /**
     * Shuts down the connection manager and the idle connection monitor thread.
     */
    public synchronized void shutDown()
    {
        if (connectionMonitorThread != null) {
            connectionManager.shutdown();
            connectionMonitorThread.shutdown();
        }
    }

    /**
     * @return the underlying HTTP client used for all requests
     */
    public HttpClient getHttpClient()
    {
        return httpClient;
    }

    /**
     * Entity wrapper that exposes the gzip-compressed content of the wrapped
     * entity as a decompressed stream.
     */
    private static class GzipDecompressingEntity extends HttpEntityWrapper
    {

        public GzipDecompressingEntity(final HttpEntity entity)
        {
            super(entity);
        }

        /**
         * @return a stream that inflates the wrapped entity's content on the fly
         */
        @Override
        public InputStream getContent() throws IOException, IllegalStateException
        {

            // the wrapped entity's getContent() decides about repeatability
            InputStream wrappedin = wrappedEntity.getContent();

            return new GZIPInputStream(wrappedin);
        }

        /**
         * @return always -1, since the decompressed length is not known up front
         */
        @Override
        public long getContentLength()
        {
            // length of unzipped content is not known
            return -1;
        }

    }
}
